library(readr)
## Warning: package 'readr' was built under R version 4.2.2
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ dplyr   1.0.10
## ✔ tibble  3.1.8      ✔ stringr 1.4.1 
## ✔ tidyr   1.2.1      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)

library(tidytext)
library(textdata)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.2.2
## Loading required package: RColorBrewer
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.2.2
library(RColorBrewer)
library(syuzhet)
## Warning: package 'syuzhet' was built under R version 4.2.2
library(readr)
# Load the combined UFO sighting + weather data set.
# col_types: keep `month` as character (it is re-used as a discrete axis
# later) and parse `hour` as a time-of-day value from the "%H" format.
# NOTE(review): the "New names: `` -> `...1`" message below means the CSV
# has an unnamed leading index column; consider naming or dropping it
# explicitly in the col_types spec.
UFO_and_Weather <- read_csv("UFO_and_Weather.csv", 
    col_types = cols(month = col_character(), 
        hour = col_time(format = "%H")))
## New names:
## • `` -> `...1`
# View() opens the interactive data viewer (RStudio only; no-op when knitting).
View(UFO_and_Weather)

#new eda goals

Text analysis

We reuse the same text-analysis approach as before.

First, let's build a word-frequency table.

#Step 1:tokenize corpus

# Tokenize the sighting summaries: one row per word across all reports.
words <- unnest_tokens(
  select(UFO_and_Weather, text),
  output = word,
  input = text
)

head(words)
## # A tibble: 6 Ă— 1
##   word     
##   <chr>    
## 1 my       
## 2 wife     
## 3 was      
## 4 driving  
## 5 southeast
## 6 on
#Now, we'll generate a count of the words, sort by the number of times the word occurs, and then plot the top 15 words in a bar plot

#x= season and poem=line

# Top 15 tokens by raw frequency (stop words still included).
# Fixes: TRUE instead of the reassignable shorthand T; geom_col() instead
# of geom_bar(stat = "identity"); reorder(word, -n) instead of an anonymous
# negation function.
words %>%
  count(word, sort = TRUE) %>%
  slice(1:15) %>%
  ggplot(aes(x = reorder(word, -n), y = n)) +
  geom_col() +
  theme_light() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  xlab("Words") +
  ggtitle(" Word Count (with stop words)")

#as we can see, the most popular words (at the moment) are stop words. This isn't very helpful for our analysis, so we'll take them out

Now let's remove the stop words.

#Step 2: Using the `TidyText` package, remove stop words and generate a new word count
# Remove stop words. Passing `by = "word"` makes the join key explicit and
# silences the "Joining, by" message.
ufo_no_stop <- words %>%
  anti_join(stop_words, by = "word")

# Top 15 tokens with stop words removed (TRUE / geom_col() / reorder(-n)
# as above).
ufo_no_stop %>%
  count(word, sort = TRUE) %>%
  slice(1:15) %>%
  ggplot(aes(x = reorder(word, -n), y = n)) +
  geom_col() +
  theme_light() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  xlab("Words") +
  ggtitle("Word Frequency without Stop Words") 

We can see the most common words are those typical for a UFO report. Light, sky, object, moving, and looked all make sense here.

Making a word cloud.

First making a term-document matrix. Following tutorial from here: http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know

Document matrix is a table containing the frequency of the words. Column names are words and row names are documents.

#Build a term-document matrix

# BUG FIX: tm's TermDocumentMatrix() expects a corpus, not a tibble of
# tokens. Passing ufo_no_stop directly yields a degenerate matrix — note
# the garbled row names in the printed frequency table and the numeric(0)
# result from findAssocs() further down. Build a proper corpus with one
# document per report (lower-cased, punctuation and stop words removed),
# as in the sthda tutorial this section follows.
ufo_corpus <- VCorpus(VectorSource(UFO_and_Weather$text)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords("english"))

dtm <- TermDocumentMatrix(ufo_corpus)
# as.matrix() on ~22k documents is memory-hungry; acceptable at this scale,
# but slam::row_sums(dtm) is a lighter alternative if it becomes a problem.
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
# row.names = NULL avoids the duplicated word/row-name display.
d <- data.frame(word = names(v), freq = v, row.names = NULL)
head(d, 10)
##                word  freq
## "light",   "light", 24545
## "lights", "lights", 23581
## "sky",       "sky", 23153
## "object", "object", 17931
## "bright", "bright", 13832
## "moving", "moving", 11887
## "white",   "white",  9945
## "looked", "looked",  9287
## "red",       "red",  8820
## "time",     "time",  7510
#generate word cloud
# Classic word cloud of the 200 most frequent terms; fixed seed keeps the
# layout reproducible across knits.
set.seed(1234)
wordcloud(
  words        = d$word,
  freq         = d$freq,
  min.freq     = 1,
  max.words    = 200,
  random.order = FALSE,
  rot.per      = 0.35,
  scale        = c(3.5, 0.25),  # smaller max size avoids cropped words
  colors       = brewer.pal(8, "Dark2")
)

#was getting error about word cloud cropping certain words, added the argument scale=c(3.5,0.25)

# Experiments with wordcloud2's HTML widgets: shape, size, and color themes.
wordcloud2(data = d, size = 0.5, shape = "star")
wordcloud2(data = d, size = 1.6, color = "random-dark")
wordcloud2(data = d, color = "random-light", backgroundColor = "grey")
# Render the cloud in the shape of the letters "UFO".
letterCloud(d, word = "UFO", wordSize = 2)
#ok lets try with the shape of a ufo now

# figPath = system.file("ufo.png",package = "wordcloud2")
# 
# wordcloud2(d, figPath = figPath, size = 1.5,color = "skyblue")
# 
# 
# figPath = system.file("t.png",package = "wordcloud2")
# wordcloud2(d, figPath = figPath, size = 1.5,color = "red")
# 
# 
# # Change the shape using your image
# wordcloud2(d, figPath = "t.png", size = 1.5, color = "red", backgroundColor="gray")
#findFreqTerms(dtm, lowfreq = 4)

# Terms whose per-document counts correlate with "lights" at r >= 0.3.
# NOTE(review): the numeric(0) result below — no associations at all — is a
# symptom of the degenerate term-document matrix built above (a token
# tibble was passed to TermDocumentMatrix instead of a corpus); with a
# proper corpus this should return real correlations.
findAssocs(dtm, terms = "lights", corlimit = 0.3)
## $lights
## numeric(0)
#frequency table of words
head(d, 10)
##                word  freq
## "light",   "light", 24545
## "lights", "lights", 23581
## "sky",       "sky", 23153
## "object", "object", 17931
## "bright", "bright", 13832
## "moving", "moving", 11887
## "white",   "white",  9945
## "looked", "looked",  9287
## "red",       "red",  8820
## "time",     "time",  7510

This gives us a quantitative measure of how much these words appear in the UFO summaries.

Text analysis without the automated (MADAR node) reports.

#making a df without the node observations

# Drop the automated MADAR node reports, then tokenize the remaining text.
df2 <- UFO_and_Weather %>%
  filter(!grepl("MADAR", text))

words <- unnest_tokens(
  select(df2, text),
  output = word,
  input = text
)

head(words)
## # A tibble: 6 Ă— 1
##   word     
##   <chr>    
## 1 my       
## 2 wife     
## 3 was      
## 4 driving  
## 5 southeast
## 6 on
#stop words
# Remove stop words from the MADAR-free tokens; the explicit `by = "word"`
# silences the "Joining, by" message (same fix as the first anti_join).
ufo_no_stop2 <- words %>%
  anti_join(stop_words, by = "word")
#Build a term-document matrix

# BUG FIX: same TermDocumentMatrix() misuse as the first matrix — it needs
# a corpus, not a token tibble. Build one document per non-MADAR report.
ufo_corpus2 <- VCorpus(VectorSource(df2$text)) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords("english"))

dtm1 <- TermDocumentMatrix(ufo_corpus2)
m1 <- as.matrix(dtm1)
v1 <- sort(rowSums(m1), decreasing = TRUE)
d1 <- data.frame(word = names(v1), freq = v1, row.names = NULL)
head(d1, 10)
##                word  freq
## "light",   "light", 24541
## "lights", "lights", 23580
## "sky",       "sky", 23153
## "object", "object", 17928
## "bright", "bright", 13830
## "moving", "moving", 11885
## "white",   "white",  9940
## "looked", "looked",  9287
## "red",       "red",  8819
## "time",     "time",  7506
#generate word cloud
# Regenerate the clouds from the MADAR-free frequency table.
set.seed(123)
wordcloud(
  words        = d1$word,
  freq         = d1$freq,
  min.freq     = 1,
  max.words    = 200,
  random.order = FALSE,
  rot.per      = 0.35,
  scale        = c(3.5, 0.25),
  colors       = brewer.pal(8, "Dark2")
)

# wordcloud2 variants on the same data.
wordcloud2(data = d1, size = 0.5, shape = "star")
wordcloud2(data = d1, size = 1.6, color = "random-dark")
wordcloud2(data = d1, color = "random-light", backgroundColor = "grey")

Investigate with and w/o July 4th outlier

#going to take out july 4th from all years to investigate how that influences the data set
# Copy the data so the July-4th filtering doesn't clobber the original.
UFO_no_july4 <- UFO_and_Weather

# Reduce date_time to an "mm-dd" string so July 4th matches across years.
UFO_no_july4$date <- format(as.Date(UFO_no_july4$date_time), "%m-%d")

# BUG FIX: the original piped into filter() without assigning the result,
# so the July 4th rows were only printed, never removed — the plots below
# still contained them. Assign the filtered data back. Since `date` is
# always exactly "mm-dd", an exact comparison is also safer than grepl().
UFO_no_july4 <- UFO_no_july4 %>%
  filter(date != "07-04")
## # A tibble: 22,123 Ă— 19
##     ...1 city  state date_time           shape text  city_…¹ city_…²  year month
##    <dbl> <chr> <chr> <dttm>              <chr> <chr>   <dbl>   <dbl> <dbl> <chr>
##  1     0 Ches… VA    2019-12-12 18:43:00 light My w…    37.3   -77.4  2019 12   
##  2     1 Rock… CT    2019-03-22 18:30:00 circ… I th…    41.7   -72.6  2019 3    
##  3     2 Otta… ON    2019-04-17 02:00:00 tear… I wa…    45.4   -75.7  2019 4    
##  4     3 Kirb… TX    2019-04-02 20:25:00 disk  The …    30.7   -94.0  2019 4    
##  5     4 Tucs… AZ    2019-05-01 11:00:00 unkn… Desc…    32.3  -111.   2019 5    
##  6     5 Gold… AZ    2019-04-10 17:00:00 circ… Apr.…    33.4  -111.   2019 4    
##  7     6 Broo… IN    2019-06-18 21:00:00 sphe… Meta…    39.4   -85.0  2019 6    
##  8     7 Melb… FL    2019-06-12 22:00:00 unkn… We t…    28.0   -80.5  2019 6    
##  9     8 Carr… NM    2019-06-11 22:00:00 chan… I wa…    33.8  -106.   2019 6    
## 10     9 Waco  TX    2018-06-15 01:00:00 circ… I wa…    31.6   -97.1  2018 6    
## # … with 22,113 more rows, 9 more variables: day <dbl>, hour <time>,
## #   temperature <dbl>, relative_humidity <dbl>, precipitation <dbl>,
## #   snow <lgl>, wind_direction <dbl>, wind_speed <dbl>, date <chr>, and
## #   abbreviated variable names ¹​city_latitude, ²​city_longitude

Now let's check that the filtering worked.

# Sightings per month, bars ordered by count and flipped for readability.
UFO_no_july4 %>%
  ggplot(aes(x = reorder(month, month, FUN = length))) +
  geom_bar() +
  coord_flip()

# Zoom in on July only and order the days by sighting count.
just_july1 <- UFO_no_july4 %>%
  filter(month == 7)

just_july1 %>%
  ggplot(aes(x = reorder(day, day, FUN = length))) +
  geom_bar() +
  coord_flip()

Look at timing of sightings on July 4th?